import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # setting seaborn default for plots
print("Setup Complete")
Setup Complete
import os
os.listdir()
['tv_shows.csv', '.DS_Store', '.jovianrc', '.ipynb_checkpoints', 'streaming.ipynb']
raw_tv_shows_df = pd.read_csv('./tv_shows.csv', index_col=0)
raw_tv_shows_df.head()
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Breaking Bad | 2008 | 18+ | 9.5 | 96% | 1 | 0 | 0 | 0 | 1 |
| 1 | Stranger Things | 2016 | 16+ | 8.8 | 93% | 1 | 0 | 0 | 0 | 1 |
| 2 | Money Heist | 2017 | 18+ | 8.4 | 91% | 1 | 0 | 0 | 0 | 1 |
| 3 | Sherlock | 2010 | 16+ | 9.1 | 78% | 1 | 0 | 0 | 0 | 1 |
| 4 | Better Call Saul | 2015 | 18+ | 8.7 | 97% | 1 | 0 | 0 | 0 | 1 |
tv_shows_df = raw_tv_shows_df.copy()
tv_shows_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5611 entries, 0 to 5610 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 5611 non-null object 1 Year 5611 non-null int64 2 Age 3165 non-null object 3 IMDb 4450 non-null float64 4 Rotten Tomatoes 1011 non-null object 5 Netflix 5611 non-null int64 6 Hulu 5611 non-null int64 7 Prime Video 5611 non-null int64 8 Disney+ 5611 non-null int64 9 type 5611 non-null int64 dtypes: float64(1), int64(6), object(3) memory usage: 482.2+ KB
tv_shows_df.describe()
| Year | IMDb | Netflix | Hulu | Prime Video | Disney+ | type | |
|---|---|---|---|---|---|---|---|
| count | 5611.000000 | 4450.000000 | 5611.000000 | 5611.000000 | 5611.000000 | 5611.000000 | 5611.0 |
| mean | 2011.021030 | 7.113258 | 0.344145 | 0.312600 | 0.382107 | 0.032080 | 1.0 |
| std | 11.005116 | 1.132060 | 0.475131 | 0.463594 | 0.485946 | 0.176228 | 0.0 |
| min | 1901.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 25% | 2010.000000 | 6.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 50% | 2015.000000 | 7.300000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
| 75% | 2017.000000 | 7.900000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.0 |
| max | 2020.000000 | 9.600000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.0 |
tv_shows_df['Netflix'].sum()
1931
tv_shows_df['Hulu'].sum()
1754
tv_shows_df['Prime Video'].sum()
2144
tv_shows_df['Disney+'].sum()
180
Percent of age-rated shows that are rated 18+ (shows with no age rating are left out of the denominator):
(tv_shows_df['Age'] =='18+').sum()/tv_shows_df['Age'].count() *100
23.696682464454977
tv_shows_df['Age'].value_counts().sum()
3165
tv_shows_df['Age'].value_counts()
16+ 1018 7+ 848 18+ 750 all 545 13+ 4 Name: Age, dtype: int64
# Share of age-rated shows whose rating is 'all', as a percentage.
# The result is kept in its own variable: assigning it to age_df['all'] would
# append a row labelled 'all' to the Series, and `age_df.all` is the built-in
# Series.all method rather than that value.
age_df = tv_shows_df['Age'].copy()
# count() only counts non-missing ratings, so unrated shows are not in the denominator.
all_ages_pct = (age_df == 'all').sum() / age_df.count() * 100
all_ages_pct
<bound method Series.all of 0 18+
1 16+
2 18+
3 16+
4 18+
...
5607 NaN
5608 NaN
5609 NaN
5610 NaN
all 17.2196
Name: Age, Length: 5612, dtype: object>
age_df
0 18+
1 16+
2 18+
3 16+
4 18+
...
5607 NaN
5608 NaN
5609 NaN
5610 NaN
all 17.2196
Name: Age, Length: 5612, dtype: object
tv_shows_df['Age']
0 18+
1 16+
2 18+
3 16+
4 18+
...
5606 NaN
5607 NaN
5608 NaN
5609 NaN
5610 NaN
Name: Age, Length: 5611, dtype: object
Following this example, importing more libraries: https://www.kaggle.com/sreshta140/a-quick-analysis-on-tv-shows
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objs as go
Converting Rotten Tomatoes score to a float
# Remove the percent sign from the textual scores, then store them as floats.
rt_without_percent = tv_shows_df['Rotten Tomatoes'].str.replace('%', '')
tv_shows_df['Rotten Tomatoes'] = rt_without_percent.astype('float')
tv_shows_df['Rotten Tomatoes'].head()
0 96.0 1 93.0 2 91.0 3 78.0 4 97.0 Name: Rotten Tomatoes, dtype: float64
Correlation Matrix
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# Text columns such as Title are excluded explicitly: newer pandas releases no
# longer drop non-numeric columns inside DataFrame.corr() and raise instead.
corrmat = tv_shows_df.select_dtypes(include='number').corr()
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, annot=True)
plt.show()
Finding the top-rated shows on IMDb:
top_imdb = tv_shows_df.sort_values(by=['IMDb'], ascending=False)
top_10_imdb = top_imdb.head(10)
top_10_imdb
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3023 | Destiny | 2014 | NaN | 9.6 | NaN | 0 | 1 | 0 | 0 | 1 |
| 0 | Breaking Bad | 2008 | 18+ | 9.5 | 96.0 | 1 | 0 | 0 | 0 | 1 |
| 3747 | Malgudi Days | 1987 | all | 9.5 | NaN | 0 | 0 | 1 | 0 | 1 |
| 3177 | Hungry Henry | 2014 | NaN | 9.5 | NaN | 0 | 1 | 0 | 0 | 1 |
| 3567 | Band of Brothers | 2001 | 18+ | 9.4 | 94.0 | 0 | 0 | 1 | 0 | 1 |
| 2365 | The Joy of Painting | 1983 | all | 9.4 | NaN | 0 | 1 | 1 | 0 | 1 |
| 4128 | Green Paradise | 2011 | all | 9.3 | NaN | 0 | 0 | 1 | 0 | 1 |
| 91 | Our Planet | 2019 | 7+ | 9.3 | 93.0 | 1 | 0 | 0 | 0 | 1 |
| 3566 | The Wire | 2002 | 18+ | 9.3 | 94.0 | 0 | 0 | 1 | 0 | 1 |
| 325 | Ramayan | 1987 | all | 9.3 | NaN | 1 | 0 | 0 | 0 | 1 |
Of these top 10 IMDb scores, how many are on streaming? How many on multiple streaming?
import plotly.express as px
fig = px.bar(top_10_imdb, y='IMDb', x='Title', color='IMDb')
fig.show()
How many TV shows were released each year?
yr = tv_shows_df['Year'].value_counts()
yr
2017 653
2016 573
2018 556
2015 454
2019 396
...
1949 1
1934 1
1931 1
1943 1
1904 1
Name: Year, Length: 81, dtype: int64
# Two-column table (Year, Count) of how many shows were released in each year.
# Both column names are set explicitly with rename_axis / reset_index(name=...),
# so the result does not depend on how value_counts() labels its output, and the
# table is rebuilt from tv_shows_df so this cell can be re-run on its own.
yr = tv_shows_df['Year'].value_counts().rename_axis('Year').reset_index(name='Count')
import plotly.express as px
fig = px.bar(yr, x='Year', y='Count', color='Count')
fig.show()
age = tv_shows_df['Age'].value_counts()
age
16+ 1018 7+ 848 18+ 750 all 545 13+ 4 Name: Age, dtype: int64
# Two-column table (Age, Count) of how many shows carry each age rating.
# Both column names are set explicitly with rename_axis / reset_index(name=...),
# so the result does not depend on how value_counts() labels its output, and the
# table is rebuilt from tv_shows_df so this cell can be re-run on its own.
age = tv_shows_df['Age'].value_counts().rename_axis('Age').reset_index(name='Count')
fig = px.bar(age, x='Age', y='Count', color='Count')
fig.show()
fig = px.histogram(tv_shows_df, x='Rotten Tomatoes', nbins=100, opacity=0.8, color_discrete_sequence=['darkkhaki'])
fig.show()
analyzing the services:
# Titles available on each streaming service (one filtered frame per service).
netflix = tv_shows_df[tv_shows_df['Netflix'] == 1]
hulu = tv_shows_df[tv_shows_df['Hulu'] == 1]
prime = tv_shows_df[tv_shows_df['Prime Video'] == 1]
disney = tv_shows_df[tv_shows_df['Disney+'] == 1]

channels = [netflix, hulu, prime, disney]
cols = ['Year', 'Age', 'IMDb']

# tv_shows_list[s][c] is the value-count table of column cols[c] for service
# channels[s]: a DataFrame holding the column's values under its own name and
# their frequencies under 'Count'. The column names are set explicitly so they
# do not depend on how value_counts() labels its output.
tv_shows_list = [
    [
        channel[col].value_counts().rename_axis(col).reset_index(name='Count')
        for col in cols
    ]
    for channel in channels
]
disney.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 180 entries, 472 to 5610 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 180 non-null object 1 Year 180 non-null int64 2 Age 150 non-null object 3 IMDb 169 non-null float64 4 Rotten Tomatoes 24 non-null float64 5 Netflix 180 non-null int64 6 Hulu 180 non-null int64 7 Prime Video 180 non-null int64 8 Disney+ 180 non-null int64 9 type 180 non-null int64 dtypes: float64(2), int64(6), object(2) memory usage: 15.5+ KB
Release years of the TV shows in each service's library:
import plotly.offline as pyoff

# One bar trace per service, read from that service's Year count table
# (position 0 of its entry in tv_shows_list).
service_names = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
plot_data = []
for service_name, count_tables in zip(service_names, tv_shows_list):
    year_counts = count_tables[0]
    plot_data.append(
        go.Bar(x=year_counts['Year'], y=year_counts['Count'], name=service_name)
    )

plot_layout = go.Layout(title='Year', xaxis_title='Year', yaxis_title='Count')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# One bar trace per service, read from that service's Age count table
# (position 1 of its entry in tv_shows_list).
plot_data = [
    go.Bar(x=count_tables[1]['Age'], y=count_tables[1]['Count'], name=service_name)
    for service_name, count_tables in zip(
        ('Netflix', 'Hulu', 'Prime Video', 'Disney+'), tv_shows_list
    )
]
layout_options = {'title': 'Age', 'yaxis_title': 'Count', 'xaxis_title': 'Age'}
plot_layout = go.Layout(**layout_options)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
import plotly.offline as pyoff

# One bar trace per service, read from that service's IMDb count table
# (position 2 of its entry in tv_shows_list).
imdb_tables = {
    'Netflix': tv_shows_list[0][2],
    'Hulu': tv_shows_list[1][2],
    'Prime Video': tv_shows_list[2][2],
    'Disney+': tv_shows_list[3][2],
}
plot_data = [
    go.Bar(x=imdb_counts['IMDb'], y=imdb_counts['Count'], name=service_name)
    for service_name, imdb_counts in imdb_tables.items()
]
plot_layout = go.Layout(title='IMDb', xaxis_title='IMDb', yaxis_title='Count')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# One histogram of Rotten Tomatoes scores per service, drawn in a single figure.
plot_data = [
    go.Histogram(x=service_df["Rotten Tomatoes"], name=service_name)
    for service_name, service_df in (
        ('Netflix', netflix),
        ('Hulu', hulu),
        ('Prime Video', prime),
        ('Disney+', disney),
    )
]
plot_layout = go.Layout(
    title='Rotten Tomatoes', xaxis_title='Rotten Tomatoes', yaxis_title='Count'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Separating viewing platforms
# One frame per platform: keep the rows flagged for that platform, then discard
# the other platforms' flag columns and the 'type' column.
netflix_shows = (
    tv_shows_df.loc[tv_shows_df['Netflix'] == 1]
    .drop(columns=['Hulu', 'Prime Video', 'Disney+', 'type'])
)
hulu_shows = (
    tv_shows_df.loc[tv_shows_df['Hulu'] == 1]
    .drop(columns=['Netflix', 'Prime Video', 'Disney+', 'type'])
)
prime_shows = (
    tv_shows_df.loc[tv_shows_df['Prime Video'] == 1]
    .drop(columns=['Netflix', 'Hulu', 'Disney+', 'type'])
)
disney_shows = (
    tv_shows_df.loc[tv_shows_df['Disney+'] == 1]
    .drop(columns=['Netflix', 'Hulu', 'Prime Video', 'type'])
)
netflix_shows.columns
Index(['Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix'], dtype='object')
hulu_shows.head()
| Title | Year | Age | IMDb | Rotten Tomatoes | Hulu | |
|---|---|---|---|---|---|---|
| 13 | Attack on Titan | 2013 | 16+ | 8.8 | 94.0 | 1 |
| 15 | Fullmetal Alchemist: Brotherhood | 2009 | 18+ | 9.1 | 100.0 | 1 |
| 16 | Community | 2009 | 7+ | 8.5 | 88.0 | 1 |
| 18 | Parks and Recreation | 2009 | 16+ | 8.6 | 93.0 | 1 |
| 22 | Twin Peaks | 1990 | 18+ | 8.8 | 89.0 | 1 |
# Row index of each platform frame, and the number of shows each one holds.
index_netflix, index_hulu, index_prime, index_disney = (
    netflix_shows.index,
    hulu_shows.index,
    prime_shows.index,
    disney_shows.index,
)
total_netflix, total_hulu, total_prime, total_disney = (
    len(index_netflix),
    len(index_hulu),
    len(index_prime),
    len(index_disney),
)

# Pie chart of each platform's share of the listed shows.
labels = 'Netflix', 'Hulu', 'Prime Video', 'Disney+'
sizes = [total_netflix, total_hulu, total_prime, total_disney]
explode = (0.1,) * 4  # pull every wedge out by the same offset
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=100)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()
8+ ratings on IMDb
# Boolean masks: True where a show's IMDb score is strictly greater than 8.
# Shows without an IMDb score compare as False and are therefore not counted.
rate_show_netflix = netflix_shows['IMDb'] > 8
rate_show_hulu = hulu_shows['IMDb'] > 8
rate_show_prime = prime_shows['IMDb'] > 8
rate_show_disney = disney_shows['IMDb'] > 8

# The data set holds TV shows, so the report says "TV shows" rather than "Movies".
for platform, mask in (
    ('Netflix', rate_show_netflix),
    ('Hulu', rate_show_hulu),
    ('Prime Video', rate_show_prime),
    ('Disney+', rate_show_disney),
):
    print(f"Total TV shows on {platform} rated above 8 (IMDb):", mask.sum())
Total Movies on Netflix with more than 8+ rating(IMDb): 383 Total Movies on Hulu with more than 8+ rating(IMDb): 279 Total Movies on Prime Video with more than 8+ rating(IMDb): 293 Total Movies on Disney+ with more than 8+ rating(IMDb): 30
# Number of shows rated above 8 on IMDb per platform. top_plat[i] is the label
# for top_rated[i]; the chart table is built from these two lists so labels and
# counts cannot drift apart. Bars are drawn in this order.
top_rated = [rate_show_netflix.sum(), rate_show_hulu.sum(), rate_show_prime.sum(), rate_show_disney.sum()]
top_plat = ['Netflix', 'Hulu', 'Prime Video', 'Disney']
top_rated_data = pd.DataFrame({
    'platforms': top_plat,
    'total_show': top_rated,
})

plt.figure(figsize=(10, 10))
sns.barplot(data=top_rated_data, x='platforms', y='total_show')
# Platforms run along the x-axis and the counts along the y-axis.
plt.xlabel('Platform')
plt.ylabel('Total number of 8+ rated shows')
plt.title('Platform with most tv show rated above 8+ (IMDb)')
plt.show()
Rotten Tomatoes Rating:
# Boolean masks: True where a show's Rotten Tomatoes score is 80 or higher.
# Shows without a Rotten Tomatoes score compare as False and are not counted.
rt_show_netflix = netflix_shows['Rotten Tomatoes'] >= 80
rt_show_hulu = hulu_shows['Rotten Tomatoes'] >= 80
rt_show_prime = prime_shows['Rotten Tomatoes'] >= 80
rt_show_disney = disney_shows['Rotten Tomatoes'] >= 80

# Every line names Rotten Tomatoes as the rating source and describes the
# threshold as the comparison applies it (>= 80); the data set holds TV shows.
for platform, mask in (
    ('Netflix', rt_show_netflix),
    ('Hulu', rt_show_hulu),
    ('Prime Video', rt_show_prime),
    ('Disney+', rt_show_disney),
):
    print(f"Total TV shows on {platform} rated 80 or higher (Rotten Tomatoes):", mask.sum())
Total Movies on Netflix with more than 80 rating(Rotten Tomatoes): 278 Total Movies on Hulu with more than 80 rating(IMDb): 229 Total Movies on Prime Video with more than 80 rating(Rotten Tomatoes): 129 Total Movies on Disney+ with more than 80 rating(Rotten Tomatoes): 17
# Number of shows scoring 80 or higher on Rotten Tomatoes per platform.
# rt_top_plat[i] is the label for rt_top_rated[i]; the chart table is built from
# these two lists so labels and counts cannot drift apart. Bars are drawn in
# this order.
rt_top_rated = [rt_show_netflix.sum(), rt_show_hulu.sum(), rt_show_prime.sum(), rt_show_disney.sum()]
rt_top_plat = ['Netflix', 'Hulu', 'Prime Video', 'Disney']
rt_top_rated_data = pd.DataFrame({
    'platforms': rt_top_plat,
    'total_show': rt_top_rated,
})

plt.figure(figsize=(10, 10))
sns.barplot(data=rt_top_rated_data, x='platforms', y='total_show')
# Platforms run along the x-axis and the counts along the y-axis.
plt.xlabel('Platform')
plt.ylabel('Total number of 80+ Rotten Tomatoes ratings')
plt.title('Platform with most tv show rated above 80+ (Rotten Tomatoes)')
plt.show()
# Per-age-rating totals of the numeric columns; for the 0/1 platform flags this
# is the number of shows with that rating on each platform.
# numeric_only=True is spelled out so text columns such as Title are left out
# regardless of the pandas version's default.
age_df2 = tv_shows_df.groupby('Age').sum(numeric_only=True)
age_df2[['Netflix', 'Hulu', 'Prime Video', 'Disney+']]
| Netflix | Hulu | Prime Video | Disney+ | |
|---|---|---|---|---|
| Age | ||||
| 13+ | 3 | 0 | 1 | 0 |
| 16+ | 398 | 514 | 209 | 3 |
| 18+ | 359 | 239 | 182 | 0 |
| 7+ | 300 | 365 | 224 | 66 |
| all | 171 | 159 | 192 | 81 |
sns.catplot(y='Age', kind='count', palette = 'pastel', edgecolor = '.6',
data = tv_shows_df)
<seaborn.axisgrid.FacetGrid at 0x7f835bbee350>
Netflix count by age:
def _show_age_chart(platform, title):
    """Display a bar chart of `platform`'s show count per age rating.

    Reads the counts from the `platform` column of age_df2, with the age
    ratings (the index of age_df2) along the x-axis.

    Returns the go.Bar trace that was plotted.
    """
    # A scalar width applies to every bar, so the chart does not depend on
    # how many age categories age_df2 contains.
    trace = go.Bar(x=age_df2.index, y=age_df2[platform], name=platform, width=0.5)
    layout = go.Layout(title=title, legend=dict(x=0.1, y=0.1))
    fig = go.Figure([trace], layout=layout)
    fig.update_traces(marker_color='rgb(102,216,23)', marker_line_color='rgb(87,96,107)',
                      marker_line_width=1.5, opacity=0.6)
    fig.show()
    return trace


trace1 = _show_age_chart('Netflix', 'Count of Netflix content by age')
#HULU
trace2 = _show_age_chart('Hulu', 'Count of Hulu content by age')
#PRIME
trace3 = _show_age_chart('Prime Video', 'Count of Prime content by age')
#DISNEY
# The title names Disney+, matching the data shown in this chart.
trace4 = _show_age_chart('Disney+', 'Count of Disney+ content by age')
How would I stack this data on top of each other?
By year:
ax = sns.catplot(x='Year',kind='count',data=tv_shows_df,orient="h",height=30,aspect=2)
ax.fig.suptitle('Number of TV series per year')
ax.fig.autofmt_xdate()
DF Form:
# Number of shows per release year on each platform.
# The four 0/1 platform flags are selected before summing, so the grouping is
# built once and no other column is aggregated.
year_df = tv_shows_df.groupby('Year')
year_sum = year_df[['Netflix', 'Hulu', 'Prime Video', 'Disney+']].sum()
Top production years:
year_sum.columns
Index(['Netflix', 'Hulu', 'Prime Video', 'Disney+'], dtype='object')
year_sum.iloc[0].value_counts()
0 3 2 1 Name: 1901, dtype: int64
year_sum['Year'] = year_sum.index
year_sum['Year']
Year
1901 1901
1904 1904
1914 1914
1931 1931
1932 1932
...
2016 2016
2017 2017
2018 2018
2019 2019
2020 2020
Name: Year, Length: 81, dtype: int64
year_sum['Year'].describe()
count 81.000000 mean 1978.148148 std 27.260370 min 1901.000000 25% 1960.000000 50% 1980.000000 75% 2000.000000 max 2020.000000 Name: Year, dtype: float64
year_sum.Year.sort_values()
Year
1901 1901
1904 1904
1914 1914
1931 1931
1932 1932
...
2016 2016
2017 2017
2018 2018
2019 2019
2020 2020
Name: Year, Length: 81, dtype: int64
year_sum[0:3].sort_index(ascending=False)
| Netflix | Hulu | Prime Video | Disney+ | Year | |
|---|---|---|---|---|---|
| Year | |||||
| 1914 | 1 | 0 | 0 | 0 | 1914 |
| 1904 | 0 | 0 | 1 | 0 | 1904 |
| 1901 | 0 | 0 | 2 | 0 | 1901 |
year_sum.columns
Index(['Netflix', 'Hulu', 'Prime Video', 'Disney+', 'Year'], dtype='object')
year_sum.drop(columns='Year', inplace=True)
recent_year_sum = year_sum.sort_index(ascending=False).head(15)
recent_year_sum
| Netflix | Hulu | Prime Video | Disney+ | |
|---|---|---|---|---|
| Year | ||||
| 2020 | 110 | 41 | 18 | 8 |
| 2019 | 233 | 109 | 36 | 22 |
| 2018 | 272 | 139 | 155 | 13 |
| 2017 | 221 | 132 | 305 | 19 |
| 2016 | 219 | 134 | 231 | 10 |
| 2015 | 180 | 132 | 161 | 9 |
| 2014 | 137 | 128 | 125 | 8 |
| 2013 | 92 | 111 | 111 | 7 |
| 2012 | 88 | 106 | 119 | 12 |
| 2011 | 80 | 93 | 92 | 6 |
| 2010 | 54 | 80 | 90 | 5 |
| 2009 | 44 | 61 | 67 | 4 |
| 2008 | 29 | 44 | 62 | 4 |
| 2007 | 26 | 39 | 43 | 3 |
| 2006 | 21 | 48 | 38 | 6 |
year_sum.sort_values(by='Netflix', ascending=False).head(10)
| Netflix | Hulu | Prime Video | Disney+ | |
|---|---|---|---|---|
| Year | ||||
| 2018 | 272 | 139 | 155 | 13 |
| 2019 | 233 | 109 | 36 | 22 |
| 2017 | 221 | 132 | 305 | 19 |
| 2016 | 219 | 134 | 231 | 10 |
| 2015 | 180 | 132 | 161 | 9 |
| 2014 | 137 | 128 | 125 | 8 |
| 2020 | 110 | 41 | 18 | 8 |
| 2013 | 92 | 111 | 111 | 7 |
| 2012 | 88 | 106 | 119 | 12 |
| 2011 | 80 | 93 | 92 | 6 |
year_sum['Sum'] = year_sum.sum(axis=1)
year_sum
| Netflix | Hulu | Prime Video | Disney+ | Sum | |
|---|---|---|---|---|---|
| Year | |||||
| 1901 | 0 | 0 | 2 | 0 | 2 |
| 1904 | 0 | 0 | 1 | 0 | 1 |
| 1914 | 1 | 0 | 0 | 0 | 1 |
| 1931 | 0 | 1 | 0 | 0 | 1 |
| 1932 | 0 | 0 | 1 | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 2016 | 219 | 134 | 231 | 10 | 594 |
| 2017 | 221 | 132 | 305 | 19 | 677 |
| 2018 | 272 | 139 | 155 | 13 | 579 |
| 2019 | 233 | 109 | 36 | 22 | 400 |
| 2020 | 110 | 41 | 18 | 8 | 177 |
81 rows × 5 columns
year_sum.sort_values(by='Sum', ascending=False).head(10)
| Netflix | Hulu | Prime Video | Disney+ | Sum | |
|---|---|---|---|---|---|
| Year | |||||
| 2017 | 221 | 132 | 305 | 19 | 677 |
| 2016 | 219 | 134 | 231 | 10 | 594 |
| 2018 | 272 | 139 | 155 | 13 | 579 |
| 2015 | 180 | 132 | 161 | 9 | 482 |
| 2019 | 233 | 109 | 36 | 22 | 400 |
| 2014 | 137 | 128 | 125 | 8 | 398 |
| 2012 | 88 | 106 | 119 | 12 | 325 |
| 2013 | 92 | 111 | 111 | 7 | 321 |
| 2011 | 80 | 93 | 92 | 6 | 271 |
| 2010 | 54 | 80 | 90 | 5 | 229 |
Finally, top 10 years of show releases.
sns.lineplot(data=year_sum['Sum']);
sns.lineplot(data=year_sum['Netflix']);
sns.lineplot(data=year_sum['Prime Video']);
# Per-platform counts for release years 2000 onwards, without the Sum column.
# drop() returns a new frame here, so year_sum keeps its Sum column and nothing
# is modified in place on a filtered slice.
recent = year_sum.loc[year_sum.index >= 2000].drop(columns='Sum')
recent
| Netflix | Hulu | Prime Video | Disney+ | |
|---|---|---|---|---|
| Year | ||||
| 2000 | 6 | 14 | 21 | 3 |
| 2001 | 7 | 19 | 28 | 3 |
| 2002 | 12 | 18 | 21 | 1 |
| 2003 | 11 | 20 | 25 | 2 |
| 2004 | 10 | 28 | 44 | 2 |
| 2005 | 15 | 43 | 49 | 2 |
| 2006 | 21 | 48 | 38 | 6 |
| 2007 | 26 | 39 | 43 | 3 |
| 2008 | 29 | 44 | 62 | 4 |
| 2009 | 44 | 61 | 67 | 4 |
| 2010 | 54 | 80 | 90 | 5 |
| 2011 | 80 | 93 | 92 | 6 |
| 2012 | 88 | 106 | 119 | 12 |
| 2013 | 92 | 111 | 111 | 7 |
| 2014 | 137 | 128 | 125 | 8 |
| 2015 | 180 | 132 | 161 | 9 |
| 2016 | 219 | 134 | 231 | 10 |
| 2017 | 221 | 132 | 305 | 19 |
| 2018 | 272 | 139 | 155 | 13 |
| 2019 | 233 | 109 | 36 | 22 |
| 2020 | 110 | 41 | 18 | 8 |
sns.lineplot(data=recent);
sns.lineplot(data=recent['Netflix']);
# Alternative: the same Netflix series drawn through matplotlib's Axes interface.
x = recent.axes[0]
y = recent["Netflix"]
netflix_fig, netflix_ax = plt.subplots(figsize=(12, 8))
netflix_ax.plot(x, y, linestyle='solid', label="count of TV shows")
# One tick per year, labelled with the year and rotated for readability.
netflix_ax.set_xticks(x)
netflix_ax.set_xticklabels(x, rotation=75)
netflix_ax.set_xlabel("Years")
netflix_ax.set_ylabel("Count")
netflix_ax.legend()
netflix_ax.set_title("Count of Netflix TV shows by Year")
netflix_ax.grid(color='b', linestyle='dotted', linewidth=2)
plt.show()
This route created a dataframe of only Sherlock
sherlock = tv_shows_df.loc[tv_shows_df['Title']=='Sherlock']
sherlock
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3 | Sherlock | 2010 | 16+ | 9.1 | 78.0 | 1 | 0 | 0 | 0 | 1 |
df.loc[df['column name'] condition]
This route just outputs the row.
tv_shows_df.loc[tv_shows_df['Title']=='Sherlock']
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3 | Sherlock | 2010 | 16+ | 9.1 | 78.0 | 1 | 0 | 0 | 0 | 1 |
tv_shows_df.head()
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Breaking Bad | 2008 | 18+ | 9.5 | 96.0 | 1 | 0 | 0 | 0 | 1 |
| 1 | Stranger Things | 2016 | 16+ | 8.8 | 93.0 | 1 | 0 | 0 | 0 | 1 |
| 2 | Money Heist | 2017 | 18+ | 8.4 | 91.0 | 1 | 0 | 0 | 0 | 1 |
| 3 | Sherlock | 2010 | 16+ | 9.1 | 78.0 | 1 | 0 | 0 | 0 | 1 |
| 4 | Better Call Saul | 2015 | 18+ | 8.7 | 97.0 | 1 | 0 | 0 | 0 | 1 |